knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
# Set workspace -----------------------------------------------------------
library(magrittr)
library(tidyverse)
library(reshape)
library(readxl)
library(limma)
library(edgeR)
library(biomaRt)
library(AnnotationHub)
library(GenomicRanges)
library(GenomicFeatures)
library(rtracklayer)
library(fgsea)
library(RUVSeq)
library(ngsReports)
library(metap)
library(harmonicmeanp)
library(ggpubr)
library(RColorBrewer)
library(grid)
library(gridExtra)
library(scales)
library(knitr)
library(kableExtra)
library(pheatmap)
library(ggrepel)
library(ggeasy)
library(ggfortify) # to allow ggplot2::autoplot() to work
library(pathview)
library(VennDiagram)
library(pander)
library(UpSetR)
Here is my analysis for the sorl1 4-way, 6 month RNA-seq experiment. This analysis investigates the effects of the mutations V1482Afs, and R122Pfs (null) in sorl1 on 6 month old zebrafish brains. V1482Afs models the C1478* mutation in human SORL1 which was described in Pottier et al. (2012). R122Pfs is a putative null mutation which produces a premature stop codon before any of the coding sequences which encode the functional domains of the zebrafish Sorl1 protein. The family of zebrafish in this analysis arise from a pair-mating between fish of genotypes R122Pfs/+ x V1482Afs/+. Therefore, fish in the family have genotypes R122Pfs/+ (null/+), V1482Afs/+ (EOfAD-like), R122Pfs/V1482Afs (transheterozygous) and +/+ (non-mutant). The family was raised together in the same tanks to reduce environmental variation. Since the family contained ~100 fish, they were distributed over 3 tanks as to not stress them by overcrowding.
When the family was 6 months of age, 50 fish were sacrificed and had their brains removed. Genotyping was performed by PCR then 24 fish of equal genotypes and sex had brain total RNA extracted and DNase I treatment. Then the DNaseI treated total RNA was sent to SAHMRI for RNA-seq (polyA+, single ended, 75bp insert). The resulting fastq files were then processed using bash scripts to produce a counts matrix (kallisto was used to estimate transcript abundances). Kallisto settings were mean frag length = 300, sd = 60, 50 bootstraps. The index file used for the kallisto pseudo-alignment was generated from the zebrafish transcriptome according to the primary assembly of the GRCz11 reference (Ensembl release 96), with the sequences for unspliced transcripts additionally included.None of the sorl1 alternate transcripts were added to the transcriptome as sorl1 is a long gene, and kallisto couldnt tell the alternate transcripts apart (in a previous draft analysis not shown here),
gr <- import.gff("ftp://ftp.ensembl.org/pub/release-96/gtf/danio_rerio/Danio_rerio.GRCz11.96.gtf.gz")
zfGeneNames <-
gr %>%
mcols %>%
subset(type == "gene") %>%
as_tibble() %>%
dplyr::select(c(gene_id, gene_name))
tr2gn <- gr %>%
subset(!is.na(transcript_id)) %>%
subset(type == "transcript") %>%
mcols() %>%
as.data.frame() %>%
dplyr::select(gene_id, transcript_id) %>%
as_tibble()
# also need length and gc content
ah <- AnnotationHub() %>%
subset(species == "Danio rerio") %>%
subset(rdataclass == "EnsDb")
ensDb <- ah[["AH74989"]]
grTrans <- transcripts(ensDb)
trLengths <- exonsBy(ensDb, "tx") %>%
width() %>%
vapply(sum, integer(1))
mcols(grTrans)$length <- trLengths[names(grTrans)]
gcGene <- grTrans %>%
mcols() %>%
as.data.frame() %>%
dplyr::select(gene_id, tx_id, gc_content, length) %>%
as_tibble() %>%
group_by(gene_id) %>%
summarise(
gc_content = sum(gc_content*length) / sum(length),
length = ceiling(median(length))
)
grGenes <- genes(ensDb)
mcols(grGenes) %<>%
as.data.frame() %>%
left_join(gcGene) %>%
as.data.frame() %>%
DataFrame()
This is the speadsheet containing all the info of the fish.
meta <- read_excel("meta_4way_6m_RNA-seq.xlsx") %>%
dplyr::filter(!is.na(Position)) %>% # filter out the ones i didnt use
dplyr::select(c(Sex, Genotype, Tank, Batch_killed, RNA_Seq_id, new_sample_id)) %>%
mutate(
Sex = as.factor(Sex),
Genotype = as.factor(Genotype),
Tank = as.factor(Tank),
Batch_killed = as.factor(Batch_killed)
) %>%
mutate(
sample = RNA_Seq_id %>%
str_replace_all(pattern = "_", replacement = "-"),
Genotype_forPub = case_when(
Genotype == "R122Pfs/+" ~ "null/+",
Genotype == "trans" ~ "transhet",
Genotype == "V1482Afs/+" ~ "V1482Afs/+",
Genotype == "WT" ~ "+/+"),
Sex = case_when(
Sex == "male" ~ "Male",
Sex == "fem" ~ "Female")
) %>%
mutate(Genotype_2 = as.factor(Genotype %>%
str_remove(pattern = "/\\+")
)
)
Genes which are lowly expressed in RNA-seq data are not informative for differential gene expression analysis. Since one count in one RNA-seq library and 3 counts in another library does not necesserily mean a 3 fold change. Therefore we need to filter out any genes which are lowly expressed. A general rule for filtering is having a cpm above 10/(min_lib_size/1000000) in at least half of the RNA-seq libraries. (0.66 cpm).
# # Now can make the gene DGE object
counts <- list.files(path = here::here("kallisto_out/"), full.names = TRUE) %>%
catchKallisto()
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//01-transhet-1_S1_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//02-V1482Afs-4_S2_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//03-WT-1_S3_merged_R1_001, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//04-R122Pfs-1_S4_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//05-transhet-4_S5_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//06-transhet-2_S6_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//07-WT-5_S7_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//08-WT-3_S8_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//09-transhet-3_S9_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//10-R122Pfs-4_S10_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//11-transhet-5_S11_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//12-V1482Afs-1_S12_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//13-R122Pfs-6_S13_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//14-V1482Afs-2_S14_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//15-WT-9_S15_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//16-R122Pfs-3_S16_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//17-V1482Afs-5_S17_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//18-R122Pfs-2_S18_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//19-transhet-6_S19_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//20-WT-6_S20_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//21-R122Pfs-5_S21_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//22-V1482Afs-6_S22_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//23-V1482Afs-3_S23_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
## Reading /Users/karissabarthelson/Documents/sorl1_4way/sorl1_4way_RNASeq_6m/kallisto_out//24-WT-4_S24_merged_R1_001_T1.fastq.gz, 83803 transcripts, 50 bootstraps
colnames(counts$counts) %<>%
basename() %>%
str_replace_all(pattern = "_S(.+)_merged_R1_001", "") %>%
str_replace_all(pattern = "^(0|1|2).-", replacement = "") %>%
str_replace_all(pattern = "_T1.fastq.gz", replacement = "")
counts$counts %>%
as.data.frame() %>%
rownames_to_column("ensembl_transcript_id") %>%
write_csv("catchKallisto_output.csv")
geneDGE <-
counts$counts %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
dplyr::filter(!grepl("unspliced", transcript_id)) %>%
as_tibble() %>%
mutate(
transcript_id = str_remove_all(transcript_id, "\\.[0-9]+")
) %>%
gather(key = "Sample", value = "Counts", 2:25) %>%
left_join(tr2gn) %>%
group_by(Sample, gene_id) %>%
summarise(Counts = sum(Counts)) %>%
spread(key = "Sample", value = "Counts") %>%
column_to_rownames("gene_id") %>%
DGEList() %>%
calcNormFactors(method = "TMM")
geneDGE$samples %<>%
rownames_to_column("sample") %>%
left_join(meta, by = "sample")
#Set the genotype WT as the reference level for DE analysis later
geneDGE$samples$Genotype <- relevel(geneDGE$samples$Genotype, "WT")
geneDGE$genes <-
grGenes %>%
as_tibble() %>%
dplyr::select(gene_id, gene_id, gc_content, length, symbol, description) %>%
dplyr::filter(gene_id %in% rownames(geneDGE$counts)) %>%
arrange(gene_id) %>%
mutate(ensembl_gene_id = gene_id) %>%
column_to_rownames("ensembl_gene_id")
keepThesegenes <- (rowSums(cpm(geneDGE) > 0.66) >= 12)
#before filtering
geneDGE %>%
cpm(log = TRUE) %>%
melt %>%
dplyr::filter(is.finite(value)) %>%
ggplot(aes(x = value, colour = X2)) +
geom_density() +
guides(colour = FALSE) +
ggtitle("Before filtering") +
labs(x = "logCPM", y = "Proportion of Genes") +
theme_bw()
#after filtering
geneDGE <- geneDGE[keepThesegenes,, keep.lib.sizes=FALSE]
geneDGE %>%
cpm(log = TRUE) %>%
melt %>%
dplyr::filter(is.finite(value)) %>%
ggplot(aes(x = value, colour = X2)) +
geom_density() +
guides(colour = FALSE) +
ggtitle("After filtering") +
labs(x = "logCPM", y = "Proportion of Genes") +
theme_bw()
Library sizes afer filterling lowly expressed genes.
geneDGE$samples %>%
mutate(`Library size (millions)` = lib.size/1e6) %>%
ggplot(aes(new_sample_id, `Library size (millions)`, fill = Genotype_forPub)) +
geom_col() +
theme_bw() +
theme(legend.position = "none", aspect.ratio = 1) +
xlab("") +
easy_rotate_labels(which = "x", angle = 315) +
ggsave("plots/libsize.png", height = 3, width = 3.5, scale = 1.2)
The overall similarities between samples can be visualised by principle component analysis (PCA). Some clustering of samples is observed, *i.e the V1482Afs and R122Pfs samples. However, the WT and transhets appear to be quite variable.
pca_raw <- geneDGE$counts %>%
cpm(log = T) %>%
t() %>%
prcomp()
pca_raw %>%
autoplot(
data = tibble(sample = rownames(.$x)) %>%
left_join(geneDGE$samples),
colour = "Genotype_forPub",
shape = "Sex",
size = 4
) +
geom_text_repel(
aes(label = new_sample_id, colour = Genotype_forPub),
show.legend = FALSE
) +
theme_bw()
Here, I will use the generalised linear model functionality of edgeR and likelihood ratio tests to look for DE genes.
Design matrix is shown below. WT is set as the intercept (i.e. the reference level).
#design with WT as the intercept
design <- model.matrix(~Genotype + Sex + Tank, data = geneDGE$samples) %>%
set_colnames(gsub("Genotype", "", colnames(.)))
design %>%
cbind(geneDGE$samples %>% dplyr::select(sample)) %>%
remove_rownames() %>%
column_to_rownames("sample") %>%
kable(caption = "Design matrix. a 1 designates that this coefficient is active") %>%
kable_styling()
| (Intercept) | R122Pfs/+ | trans | V1482Afs/+ | SexMale | Tank2 | Tank3 | |
|---|---|---|---|---|---|---|---|
| R122Pfs-1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 |
| R122Pfs-2 | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
| R122Pfs-3 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| R122Pfs-4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| R122Pfs-5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| R122Pfs-6 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
| transhet-1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
| transhet-2 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| transhet-3 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| transhet-4 | 1 | 0 | 1 | 0 | 1 | 1 | 0 |
| transhet-5 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| transhet-6 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| V1482Afs-1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
| V1482Afs-2 | 1 | 0 | 0 | 1 | 1 | 1 | 0 |
| V1482Afs-3 | 1 | 0 | 0 | 1 | 1 | 0 | 1 |
| V1482Afs-4 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| V1482Afs-5 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| V1482Afs-6 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| WT-1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| WT-3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| WT-4 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
| WT-5 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| WT-6 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| WT-9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
The code below fits the GLM (negative binomial variance function).
fit <- geneDGE %>%
estimateDisp(design) %>%
glmFit(design)
The code below performs the tests for differential expression using likelihood ratio tests from edgeR.
topTables_tmm <- design %>% colnames() %>% .[2:4] %>%
sapply(function(x){
glmLRT(fit, coef = x) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
as_tibble() %>%
arrange(PValue) %>%
mutate(
coef = x,
DE = FDR < 0.05
) %>%
dplyr::select(
symbol, logFC, logCPM, PValue, FDR, DE, everything()
)
}, simplify = FALSE)
The tables below contain the DE genes in each comparison.
topTables_tmm$`V1482Afs/+` %>%
dplyr::filter(DE == TRUE) %>%
kable(caption = "DE genes, V1482Afs/+ vs WT comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| symbol | logFC | logCPM | PValue | FDR | DE | gene_id | gc_content | length | description | LR | coef |
|---|---|---|---|---|---|---|---|---|---|---|---|
| cox7a1 | -2.590368 | 1.440764 | 0 | 2.3e-06 | TRUE | ENSDARG00000069464 | 38.64198 | 570 | cytochrome c oxidase subunit 7A1 [Source:ZFIN;Acc:ZDB-GENE-060929-340] | 41.23949 | V1482Afs/+ |
topTables_tmm$`R122Pfs/+` %>%
dplyr::filter(DE == TRUE) %>%
kable(caption = "DE genes, null/+ vs WT comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| symbol | logFC | logCPM | PValue | FDR | DE | gene_id | gc_content | length | description | LR | coef |
|---|---|---|---|---|---|---|---|---|---|---|---|
| clca1 | -9.282431 | 4.3371714 | 0.0e+00 | 0.0000000 | TRUE | ENSDARG00000016290 | 41.11958 | 3412 | chloride channel accessory 1 [Source:ZFIN;Acc:ZDB-GENE-030131-6221] | 59.77174 | R122Pfs/+ |
| plin2 | -2.541092 | 2.5789109 | 0.0e+00 | 0.0000002 | TRUE | ENSDARG00000042332 | 48.52199 | 1306 | perilipin 2 [Source:ZFIN;Acc:ZDB-GENE-050913-12] | 44.88084 | R122Pfs/+ |
| pimr191 | 2.602969 | 1.7057146 | 0.0e+00 | 0.0000050 | TRUE | ENSDARG00000055683 | 49.24027 | 2106 | Pim proto-oncogene, serine/threonine kinase, related 191 [Source:ZFIN;Acc:ZDB-GENE-131119-3] | 37.56267 | R122Pfs/+ |
| itgae.1 | -6.534700 | 1.5276286 | 0.0e+00 | 0.0000300 | TRUE | ENSDARG00000098012 | 38.42025 | 2608 | integrin, alpha E, tandem duplicate 1 [Source:ZFIN;Acc:ZDB-GENE-131121-125] | 33.50384 | R122Pfs/+ |
| CABZ01061592.2 | -6.673181 | 0.7887449 | 9.8e-06 | 0.0331827 | TRUE | ENSDARG00000104631 | 40.63564 | 4405 | NULL | 19.54244 | R122Pfs/+ |
topTables_tmm$trans %>%
dplyr::filter(DE == TRUE) %>%
kable(caption = "DE genes,transhet vs WT comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| symbol | logFC | logCPM | PValue | FDR | DE | gene_id | gc_content | length | description | LR | coef |
|---|---|---|---|---|---|---|---|---|---|---|---|
| pimr191 | 2.9264454 | 1.705715 | 0.0e+00 | 0.0000000 | TRUE | ENSDARG00000055683 | 49.24027 | 2106 | Pim proto-oncogene, serine/threonine kinase, related 191 [Source:ZFIN;Acc:ZDB-GENE-131119-3] | 54.76235 | trans |
| sorl1 | -0.5620248 | 6.908258 | 0.0e+00 | 0.0000848 | TRUE | ENSDARG00000013892 | 49.25999 | 8581 | sortilin-related receptor, L(DLR class) A repeats containing [Source:ZFIN;Acc:ZDB-GENE-050208-22] | 32.83008 | trans |
| rab4b | 1.2364312 | 3.137467 | 5.0e-07 | 0.0023009 | TRUE | ENSDARG00000012177 | 38.37866 | 2970 | RAB4B, member RAS oncogene family [Source:ZFIN;Acc:ZDB-GENE-040822-15] | 25.37880 | trans |
| ftr90 | -2.3389031 | 2.146210 | 6.0e-07 | 0.0023009 | TRUE | ENSDARG00000097373 | 44.29315 | 2497 | finTRIM family, member 90 [Source:ZFIN;Acc:ZDB-GENE-131122-80] | 24.88251 | trans |
| dnase2 | -1.3971892 | 2.004203 | 7.0e-07 | 0.0023009 | TRUE | ENSDARG00000073893 | 39.93927 | 3129 | deoxyribonuclease II, lysosomal [Source:ZFIN;Acc:ZDB-GENE-010724-16] | 24.66467 | trans |
| cox7a1 | -1.8084452 | 1.440764 | 3.5e-06 | 0.0099195 | TRUE | ENSDARG00000069464 | 38.64198 | 570 | cytochrome c oxidase subunit 7A1 [Source:ZFIN;Acc:ZDB-GENE-060929-340] | 21.50480 | trans |
Below are some plots visualising the differential expression. #### MD plots
topTables_tmm %>%
bind_rows() %>%
ggplot(aes(x = logCPM, y = logFC, colour = DE)) +
geom_point(
alpha = 0.5
) +
facet_grid(rows = vars(coef)) +
theme_bw() +
geom_label_repel(
aes(label = symbol),
data = . %>% dplyr::filter(FDR < 0.05),
show.legend = FALSE
) +
theme(legend.position = "none") +
ggtitle("MD Plots of different sorl1 mutant comparisons to WT with TMM normalisation") +
scale_color_manual(values = c("grey50", "red"))
topTables_tmm %>%
bind_rows() %>%
ggplot(aes(y = -log10(PValue), x = logFC, colour = DE)) +
geom_point(
alpha = 0.5
) +
facet_grid(rows = vars(coef)) +
theme_bw() +
geom_label_repel(
aes(label = symbol),
data = . %>% dplyr::filter(FDR < 0.05),
show.legend = FALSE
) +
theme(legend.position = "none") +
ggtitle("Volcano Plots of different sorl1 mutant comparisons to WT with TMM normalisation") +
scale_color_manual(values = c("grey50", "red"))
A ranking statistic of the sign of the logFC multiplied by the log10 of the p-value was calculated for each gene. I plotted this ranking statistic against the gc content and length of each gene to see if there are any biases observed. If there was, I would use the cqn normalisation method to correct for this. No apparent biases were observed, therefore cqn is not appropriate.
Note that the limits for the y axis (ranking stat) are zoomed in for visualisation purposes.
topTables_tmm %>%
bind_rows() %>%
mutate(rankstat = logFC*log10(PValue)) %>%
ggplot(aes(x = length, y = rankstat)) +
geom_point(
aes(colour = DE),
alpha = 0.5
) +
geom_smooth(se = FALSE, method = "gam") +
facet_grid(rows = vars(coef)) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("grey50", "red")) +
scale_x_log10()+
coord_cartesian(ylim = c(-2.5, 2.5))
topTables_tmm %>%
bind_rows() %>%
mutate(rankstat = logFC*log10(PValue)) %>%
ggplot(aes(x = gc_content, y = rankstat)) +
geom_point(
aes(colour = DE),
alpha = 0.5
) +
geom_smooth(se = FALSE, method = "gam") +
facet_grid(rows = vars(coef)) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("grey50", "red")) +
coord_cartesian(ylim = c(-5,5))
To get a more complete idea of the changes to gene expression in the mutant brains, I will perform some gene set testing. Fry from the limma package, performs the fast approximation of the ROAST method. The ROAST method is a self contained gene set testing method, which tests whether any of the gene sets contains DE genes. Instead of permutation tests (like GSEA, which assumes that genes are independent from each other, which is not true), it performs a rotation, a Monte Carlo technology for multivariate regression and more accurately will predict whether a gene set is changed.
Here, I will perform FRY on the HALLMARK and KEGG pathway gene sets which were downloaded from MSigDB (http://www.broadinstitute.org/msigdb) as a .gmt file with human entrez ids. These human entrez ids will be converted to zebrafish ensembl ids using a mapping file downloaded fro biomart. Some genes were found not to have a comlimentary zebrafish ensembl id, and this resulted in some of the KEGG pathway gene sets having only a small number of genes. Therefore, only KEGG pathway gene sets with more than 10 genes after conversion to zf ids were retained.
# need the zf2human gene mapping file from my W1818x RNA-seq analysis
zf2humangeneMapping <- read_delim("gene_sets/zf2human_entrez.txt",delim = ",") %>%
set_colnames(c("hu_gene_id", "hu_gene_name", "Entrez", "gene_id", "gene_name"))
# import KEGG gene sets
KEGG <- gmtPathways("gene_sets/c2.cp.kegg.v7.1.entrez.gmt")
KEGG %<>%
lapply(function(x){
zf2humangeneMapping %>%
dplyr::filter(Entrez %in% x,
gene_id %in% rownames(geneDGE)) %>%
.[["gene_id"]] %>%
unique()
})
KEGG_sizes <- KEGG %>%
lapply(length) %>%
unlist %>%
as.data.frame() %>%
set_colnames( "n_genes") %>%
rownames_to_column("pathway")
# retain gene sets with at least 10 genes in it
KEGG <- KEGG[KEGG_sizes %>% dplyr::filter(n_genes > 10) %>% .$pathway]
# Hallmark gene sets
hallmark <- gmtPathways("gene_sets/h.all.v7.1.entrez.gmt") %>%
lapply(function(x){
zf2humangeneMapping %>%
dplyr::filter(Entrez %in% x,
gene_id %in% rownames(geneDGE)) %>%
.[["gene_id"]]
})
hallmark_sizes <- hallmark %>%
lapply(length) %>%
unlist %>%
as.data.frame() %>%
set_colnames( "n_genes") %>%
rownames_to_column("pathway")
The code below will perform the fry method for each comparison. No significant gene sets were observed.
fryRes_TMM_kg <-
design %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
cpm(geneDGE$counts, log = T) %>%
fry(
index = KEGG,
design = design,
contrast = x,
sort = "directional"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR < 0.05 | FDR.Mixed < 0.05,
p_bonf = p.adjust(PValue, "bonf"))
}, simplify = FALSE)
fryRes_TMM_hm <-
design %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
cpm(geneDGE$counts, log = T) %>%
fry(
index = hallmark,
design = design,
contrast = x,
sort = "directional"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR < 0.05 | FDR.Mixed < 0.05,
p_bonf = p.adjust(PValue, "bonf"))
}, simplify = FALSE)
fryRes_TMM_hm %>%
bind_rows() %>%
bind_rows(fryRes_TMM_kg %>% bind_rows()) %>%
dplyr::arrange(PValue) %>%
head(10) %>%
kable(caption = "top 10 most altered pathways. No pathways even approached being significantly altered.") %>%
kable_styling("basic")
| pathway | NGenes | Direction | PValue | FDR | PValue.Mixed | FDR.Mixed | coef | sig | p_bonf |
|---|---|---|---|---|---|---|---|---|---|
| KEGG_STARCH_AND_SUCROSE_METABOLISM | 29 | Down | 0.0182654 | 0.9993872 | 0.6315558 | 0.9486453 | R122Pfs/+ | FALSE | 1 |
| KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION | 44 | Up | 0.0447168 | 0.6854738 | 0.0932308 | 0.7578406 | V1482Afs/+ | FALSE | 1 |
| KEGG_TERPENOID_BACKBONE_BIOSYNTHESIS | 13 | Down | 0.0538450 | 0.9993872 | 0.6083643 | 0.9486453 | R122Pfs/+ | FALSE | 1 |
| KEGG_AMINO_SUGAR_AND_NUCLEOTIDE_SUGAR_METABOLISM | 40 | Down | 0.0542743 | 0.9993872 | 0.5688814 | 0.9486453 | R122Pfs/+ | FALSE | 1 |
| KEGG_RNA_POLYMERASE | 25 | Up | 0.0600471 | 0.9292446 | 0.3377184 | 0.8338230 | trans | FALSE | 1 |
| KEGG_THYROID_CANCER | 32 | Down | 0.0669904 | 0.9292446 | 0.4262375 | 0.8338230 | trans | FALSE | 1 |
| KEGG_MATURITY_ONSET_DIABETES_OF_THE_YOUNG | 12 | Up | 0.0692501 | 0.6854738 | 0.3422673 | 0.7578406 | V1482Afs/+ | FALSE | 1 |
| KEGG_PRIMARY_BILE_ACID_BIOSYNTHESIS | 18 | Down | 0.0729705 | 0.9993872 | 0.7813288 | 0.9486453 | R122Pfs/+ | FALSE | 1 |
| KEGG_STARCH_AND_SUCROSE_METABOLISM | 29 | Down | 0.0792493 | 0.9292446 | 0.5248509 | 0.8338230 | trans | FALSE | 1 |
| KEGG_BASAL_CELL_CARCINOMA | 58 | Up | 0.0823560 | 0.6854738 | 0.3407576 | 0.7578406 | V1482Afs/+ | FALSE | 1 |
Since the largest source of variation in this dataset appeared to be due to library size, I can use the RUV method to remove this variation. RUV has 3 methods of estimating the unwanted variation:
• RUVg uses negative control genes, assumed to have constant expression across samples.
• RUVs uses centered (technical) replicate/negative control samples for which the covariates of interest are constant.
• RUVr uses residuals, e.g., from a first-pass GLM regression of the counts on the covariates of interest.
Here, I will use the RUVg method using negative control genes, defining the 5000 least DE genes from an ANOVA-like test for differential expression using edgeR. 5000 genes are what was used in the RUVSeq vignette.
RUVneg <-
geneDGE %>%
estimateDisp(design) %>%
glmFit(design) %>%
glmLRT(coef = 2:4) %>% # use all 3 sorl1 genotype coefs
topTags(n = Inf, adjust.method = "fdr", sort.by = "p") %>%
.$table %>%
arrange(desc(PValue)) %>%
.[1:5000,] %>%
.$gene_id
### Perform the RUVg method with 1 factor of unwanted variation removed,
RUV_k1 <- geneDGE$counts %>%
round %>%
RUVg(RUVneg, 1)
PCA analysis was repeated on the RUV normalised counts. Some seperation across PC1 was observed for WT and V1482Afs/+ samples (except for sample WT-4). The null/+ samples were seperated from the V1482Afs samples, but seem to be within the WT cluster, suggesting the null samples have a similar gene expression profile to the WTs. The transhets were fairly dispersed across PC1, with transhet1 being quite distant from the rest (likely an outlier) I will retain transhet-1 at this stage. But will keep it in mind when i do the GSEAs later and look at whether it may be driving an enrichment of a gene set.
pca_RUV_k1 <- RUV_k1$normalizedCounts %>%
as.data.frame() %>%
cpm(log = T) %>%
t() %>%
prcomp()
pca_RUV_k1 %>%
autoplot(
data = tibble(sample = rownames(.$x)) %>%
left_join(geneDGE$samples),
colour = "Genotype_forPub",
shape = "Sex",
size = 4
) +
geom_text_repel(
aes(label = new_sample_id, colour = Genotype_forPub),
show.legend = FALSE
) +
theme_bw()
pca_RUV_k1$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(geneDGE$samples) %>%
mutate(lib.size = lib.size/1e6) %>%
ggplot(aes(PC1, lib.size)) +
geom_point(
aes(colour = Genotype_forPub),
size = 4
) +
geom_text_repel(
aes(label = new_sample_id, colour = Genotype_forPub),
show.legend = FALSE
) +
theme_bw()
I now will perform the DE analysis, including the W1 covariate in the model. To do this, I need to add in the W_1 value to the design matrix:
geneDGE$samples %<>%
cbind(RUV_k1$W)
design_RUVk1 <-
model.matrix(~Genotype + Sex + Tank + W_1, data = geneDGE$samples) %>%
set_colnames(gsub("Genotype", "", colnames(.)))
fit_RUVk1 <- geneDGE %>%
estimateDisp(design_RUVk1) %>%
glmFit(design_RUVk1)
topTables_RUVk1 <- design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x){
glmLRT(fit_RUVk1, coef = x) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
as_tibble() %>%
arrange(PValue) %>%
mutate(
coef = x,
DE = FDR < 0.05
) %>%
dplyr::select(
symbol, logFC, logCPM, PValue, FDR, DE, everything()
)
}, simplify = FALSE)
# # Make a copy of the Toptables for export
# topTables_RUVk1_forExport <- topTables_RUVk1
#
# # Change the names so that it can be written out
# names(topTables_RUVk1_forExport) <- c("Null", "V1482Afs", "transhet")
#
# writexl::write_xlsx(topTables_RUVk1_forExport, path = "DE_Res/Differential_Expression_Results_RUV_k1.xlsx")
#
# # Remove the copy
# remove(topTables_RUVk1_forExport)
The tables show the DE genes in each comparison
topTables_RUVk1$`V1482Afs/+` %>%
dplyr::filter(DE == TRUE) %>%
kable(caption = "DE genes, V1482Afs/+ vs WT comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| symbol | logFC | logCPM | PValue | FDR | DE | gene_id | gc_content | length | description | LR | coef |
|---|---|---|---|---|---|---|---|---|---|---|---|
| cox7a1 | -2.44458 | 1.430217 | 0 | 2e-07 | TRUE | ENSDARG00000069464 | 38.64198 | 570 | cytochrome c oxidase subunit 7A1 [Source:ZFIN;Acc:ZDB-GENE-060929-340] | 46.36492 | V1482Afs/+ |
topTables_RUVk1$`R122Pfs/+` %>%
dplyr::filter(DE == TRUE) %>%
kable(caption = "DE genes, null/+ vs WT comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| symbol | logFC | logCPM | PValue | FDR | DE | gene_id | gc_content | length | description | LR | coef |
|---|---|---|---|---|---|---|---|---|---|---|---|
| clca1 | -9.2261626 | 4.3361910 | 0.00e+00 | 0.0000000 | TRUE | ENSDARG00000016290 | 41.11958 | 3412 | chloride channel accessory 1 [Source:ZFIN;Acc:ZDB-GENE-030131-6221] | 67.43154 | R122Pfs/+ |
| plin2 | -2.5378695 | 2.5800775 | 0.00e+00 | 0.0000000 | TRUE | ENSDARG00000042332 | 48.52199 | 1306 | perilipin 2 [Source:ZFIN;Acc:ZDB-GENE-050913-12] | 49.91796 | R122Pfs/+ |
| pimr191 | 2.5882277 | 1.6968427 | 0.00e+00 | 0.0000018 | TRUE | ENSDARG00000055683 | 49.24027 | 2106 | Pim proto-oncogene, serine/threonine kinase, related 191 [Source:ZFIN;Acc:ZDB-GENE-131119-3] | 39.52453 | R122Pfs/+ |
| itgae.1 | -6.4329375 | 1.5490807 | 0.00e+00 | 0.0000035 | TRUE | ENSDARG00000098012 | 38.42025 | 2608 | integrin, alpha E, tandem duplicate 1 [Source:ZFIN;Acc:ZDB-GENE-131121-125] | 37.69974 | R122Pfs/+ |
| cuedc1b | 1.6632437 | 3.7044499 | 0.00e+00 | 0.0001495 | TRUE | ENSDARG00000054748 | 50.21272 | 1855 | CUE domain containing 1b [Source:ZFIN;Acc:ZDB-GENE-070424-29] | 29.95017 | R122Pfs/+ |
| krt15 | 1.2285960 | 2.2445444 | 1.00e-07 | 0.0002202 | TRUE | ENSDARG00000036840 | 51.77994 | 1545 | keratin 15 [Source:ZFIN;Acc:ZDB-GENE-040426-2931] | 28.84666 | R122Pfs/+ |
| nme2b.2 | 1.3118024 | 3.1409487 | 1.10e-06 | 0.0025711 | TRUE | ENSDARG00000099420 | 56.35019 | 811 | NME/NM23 nucleoside diphosphate kinase 2b, tandem duplicate 2 [Source:ZFIN;Acc:ZDB-GENE-000210-33] | 23.80276 | R122Pfs/+ |
| vmp1 | -0.3342902 | 5.0536939 | 1.50e-06 | 0.0028014 | TRUE | ENSDARG00000012450 | 44.06607 | 2060 | vacuole membrane protein 1 [Source:ZFIN;Acc:ZDB-GENE-030131-8733] | 23.17537 | R122Pfs/+ |
| ckmb | 3.1408574 | 1.5788179 | 1.50e-06 | 0.0028014 | TRUE | ENSDARG00000040565 | 51.30344 | 1155 | creatine kinase, muscle b [Source:ZFIN;Acc:ZDB-GENE-040426-2128] | 23.15420 | R122Pfs/+ |
| pvalb4 | 2.5604268 | 1.4538921 | 2.00e-06 | 0.0034177 | TRUE | ENSDARG00000024433 | 43.28125 | 1280 | parvalbumin 4 [Source:ZFIN;Acc:ZDB-GENE-040625-48] | 22.56954 | R122Pfs/+ |
| CABZ01061592.2 | -6.6443497 | 0.7894372 | 2.90e-06 | 0.0044594 | TRUE | ENSDARG00000104631 | 40.63564 | 4405 | NULL | 21.87571 | R122Pfs/+ |
| hsph1 | -0.4412418 | 4.2977371 | 4.60e-06 | 0.0064980 | TRUE | ENSDARG00000019874 | 42.96747 | 2137 | heat shock 105/110 protein 1 [Source:ZFIN;Acc:ZDB-GENE-120410-4] | 20.98701 | R122Pfs/+ |
| mylpfa | 2.5436823 | 1.0337324 | 1.73e-05 | 0.0224639 | TRUE | ENSDARG00000053254 | 47.27928 | 1388 | myosin light chain, phosphorylatable, fast skeletal muscle a [Source:ZFIN;Acc:ZDB-GENE-990712-15] | 18.46362 | R122Pfs/+ |
| ckma | 2.3173129 | 0.9323376 | 3.00e-05 | 0.0361320 | TRUE | ENSDARG00000035327 | 51.07323 | 1584 | creatine kinase, muscle a [Source:ZFIN;Acc:ZDB-GENE-980526-109] | 17.41798 | R122Pfs/+ |
| actc1b | 1.8400417 | 2.4912099 | 3.48e-05 | 0.0391573 | TRUE | ENSDARG00000099197 | 52.18688 | 1509 | actin alpha cardiac muscle 1b [Source:ZFIN;Acc:ZDB-GENE-000322-1] | 17.13410 | R122Pfs/+ |
topTables_RUVk1$trans %>%
dplyr::filter(DE == TRUE) %>%
kable(caption = "edgeR res, transhet vs WT comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| symbol | logFC | logCPM | PValue | FDR | DE | gene_id | gc_content | length | description | LR | coef |
|---|---|---|---|---|---|---|---|---|---|---|---|
| pimr191 | 2.9119015 | 1.6968427 | 0.00e+00 | 0.0000000 | TRUE | ENSDARG00000055683 | 49.24027 | 2106 | Pim proto-oncogene, serine/threonine kinase, related 191 [Source:ZFIN;Acc:ZDB-GENE-131119-3] | 57.18526 | trans |
| rab4b | 1.3300170 | 3.1397941 | 0.00e+00 | 0.0000000 | TRUE | ENSDARG00000012177 | 38.37866 | 2970 | RAB4B, member RAS oncogene family [Source:ZFIN;Acc:ZDB-GENE-040822-15] | 49.26888 | trans |
| sorl1 | -0.5517887 | 6.9083682 | 0.00e+00 | 0.0000002 | TRUE | ENSDARG00000013892 | 49.25999 | 8581 | sortilin-related receptor, L(DLR class) A repeats containing [Source:ZFIN;Acc:ZDB-GENE-050208-22] | 43.63512 | trans |
| cuedc1b | 1.7283209 | 3.7044499 | 0.00e+00 | 0.0000071 | TRUE | ENSDARG00000054748 | 50.21272 | 1855 | CUE domain containing 1b [Source:ZFIN;Acc:ZDB-GENE-070424-29] | 36.29816 | trans |
| dnase2 | -1.3898502 | 2.0092372 | 0.00e+00 | 0.0000260 | TRUE | ENSDARG00000073893 | 39.93927 | 3129 | deoxyribonuclease II, lysosomal [Source:ZFIN;Acc:ZDB-GENE-010724-16] | 33.34816 | trans |
| cox7a1 | -1.7848139 | 1.4302172 | 1.00e-07 | 0.0003670 | TRUE | ENSDARG00000069464 | 38.64198 | 570 | cytochrome c oxidase subunit 7A1 [Source:ZFIN;Acc:ZDB-GENE-060929-340] | 27.59662 | trans |
| ftr90 | -2.3769685 | 2.1453573 | 2.00e-07 | 0.0003670 | TRUE | ENSDARG00000097373 | 44.29315 | 2497 | finTRIM family, member 90 [Source:ZFIN;Acc:ZDB-GENE-131122-80] | 27.55958 | trans |
| kcnj13 | -1.0064297 | 3.7393156 | 2.10e-06 | 0.0045039 | TRUE | ENSDARG00000043443 | 42.04939 | 4192 | potassium inwardly-rectifying channel, subfamily J, member 13 [Source:ZFIN;Acc:ZDB-GENE-070129-1] | 22.46804 | trans |
| si:dkey-182i3.10 | 0.6066976 | 2.6173933 | 2.80e-06 | 0.0052135 | TRUE | ENSDARG00000096614 | 41.40127 | 2826 | si:dkey-182i3.10 [Source:ZFIN;Acc:ZDB-GENE-121214-231] | 21.96098 | trans |
| mknk2b | -0.5245210 | 6.0219074 | 3.90e-06 | 0.0066205 | TRUE | ENSDARG00000015164 | 46.77792 | 1053 | MAPK interacting serine/threonine kinase 2b [Source:ZFIN;Acc:ZDB-GENE-030829-2] | 21.30062 | trans |
| si:dkey-182i3.9 | -0.6433283 | 2.1609848 | 5.60e-06 | 0.0082224 | TRUE | ENSDARG00000096575 | 38.67394 | 4136 | si:dkey-182i3.9 [Source:ZFIN;Acc:ZDB-GENE-121214-310] | 20.60416 | trans |
| si:ch211-281l24.3 | -0.8305178 | 7.0830386 | 5.90e-06 | 0.0082224 | TRUE | ENSDARG00000068637 | 39.11385 | 2192 | si:ch211-281l24.3 [Source:ZFIN;Acc:ZDB-GENE-131121-575] | 20.53626 | trans |
| unc119a | 0.5912908 | 3.2184112 | 6.70e-06 | 0.0086379 | TRUE | ENSDARG00000034453 | 47.57945 | 603 | unc-119 homolog a (C. elegans) [Source:ZFIN;Acc:ZDB-GENE-090930-2] | 20.28874 | trans |
| zgc:154093 | -1.1473681 | 1.0915830 | 1.31e-05 | 0.0157194 | TRUE | ENSDARG00000055897 | 46.13977 | 2733 | zgc:154093 [Source:ZFIN;Acc:ZDB-GENE-061103-517] | 19.00311 | trans |
| chrna10a | 1.0368069 | 1.3416134 | 1.50e-05 | 0.0169005 | TRUE | ENSDARG00000011113 | 47.78426 | 1715 | cholinergic receptor, nicotinic, alpha 10a [Source:ZFIN;Acc:ZDB-GENE-060503-725] | 18.73331 | trans |
| phykpl | 1.0388267 | 1.2026550 | 1.77e-05 | 0.0186346 | TRUE | ENSDARG00000040566 | 43.61564 | 726 | 5-phosphohydroxy-L-lysine phospho-lyase [Source:ZFIN;Acc:ZDB-GENE-051127-33] | 18.42408 | trans |
| hsph1 | -0.3522810 | 4.2977371 | 2.71e-05 | 0.0268768 | TRUE | ENSDARG00000019874 | 42.96747 | 2137 | heat shock 105/110 protein 1 [Source:ZFIN;Acc:ZDB-GENE-120410-4] | 17.61145 | trans |
| aplp1 | -0.2642627 | 10.7731620 | 3.22e-05 | 0.0302083 | TRUE | ENSDARG00000098368 | 45.28713 | 5973 | amyloid beta (A4) precursor-like protein 1 [Source:ZFIN;Acc:ZDB-GENE-040319-1] | 17.28067 | trans |
| CABZ01061592.2 | -4.2473766 | 0.7894372 | 4.49e-05 | 0.0398559 | TRUE | ENSDARG00000104631 | 40.63564 | 4405 | NULL | 16.65184 | trans |
| alcamb | -0.6117084 | 2.6676668 | 5.19e-05 | 0.0437511 | TRUE | ENSDARG00000058538 | 41.55296 | 3078 | activated leukocyte cell adhesion molecule b [Source:ZFIN;Acc:ZDB-GENE-030131-1768] | 16.37778 | trans |
topTables_RUVk1 %>%
bind_rows() %>%
ggplot(aes(x = logCPM, y = logFC, colour = DE)) +
geom_point(
alpha = 0.5
) +
facet_grid(rows = vars(coef)) +
theme_bw() +
geom_label_repel(
aes(label = symbol),
data = . %>% dplyr::filter(FDR < 0.05),
show.legend = FALSE
) +
theme(legend.position = "none") +
ggtitle("MD Plots of different sorl1 mutant comparisons to WT with RUV k=1") +
scale_color_manual(values = c("grey50", "red"))
topTables_RUVk1 %>%
bind_rows() %>%
ggplot(aes(y = -log10(PValue), x = logFC, colour = DE)) +
geom_point(
alpha = 0.5
) +
facet_grid(rows = vars(coef)) +
theme_bw() +
geom_label_repel(
aes(label = symbol),
data = . %>% dplyr::filter(FDR < 0.05),
show.legend = FALSE
) +
theme(legend.position = "none") +
ggtitle("Volcano Plots of different sorl1 mutant comparisons to WT with RUV (k = 1)") +
scale_color_manual(values = c("grey50", "red"))
Many more genes were detected as DE now for the sorl1 null/+ and transhet samples. Statistical evidence was only observed for one DE gene in the EOfAD-like V1482Afs/+ mutant brains, cox7a1, a gene involved in the electron transport chain.
Next, I will check whether there have been any GC or length biases introduced due to RUVSeq. I generated below the same kind of plots I did earlier in this analysis. No bias’s appear to be introduced as each of the “fit” lines are straight along 0. Note that these graphs are zoomed in so I can see whether there are any weak biases.
No biases were observed still. Which is good.
topTables_RUVk1 %>%
bind_rows() %>%
mutate(rankstat = logFC*log10(PValue)) %>%
ggplot(aes(x = length, y = rankstat)) +
geom_point(
aes(colour = DE),
alpha = 0.5
) +
geom_smooth(se = FALSE, method = "gam") +
facet_grid(rows = vars(coef)) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("grey50", "red")) +
scale_x_log10()+
scale_y_continuous(limits = c(-2.5, 2.5))
topTables_RUVk1 %>%
bind_rows() %>%
mutate(rankstat = logFC*log10(PValue)) %>%
ggplot(aes(x = gc_content, y = rankstat)) +
geom_point(
aes(colour = DE),
alpha = 0.5
) +
geom_smooth(se = FALSE, method = "gam") +
facet_grid(rows = vars(coef)) +
theme_bw() +
theme(legend.position = "none") +
scale_color_manual(values = c("grey50", "red")) +
scale_y_continuous(limits = c(-5,5))
Next, I want to see if as a group, genes tend to show significant changes.
fryRes_RUVk1_kg <-
design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
geneDGE %>%
cpm(log = TRUE) %>%
fry(
index = KEGG,
design = design_RUVk1,
contrast = x,
sort = "directional"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR < 0.05 | FDR.Mixed < 0.05,
p_bonf = p.adjust(PValue, "bonf"))
}, simplify = FALSE)
fryRes_RUVk1_hm <-
design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
geneDGE %>%
cpm(log = TRUE) %>%
fry(
index = hallmark,
design = design_RUVk1,
contrast = x,
sort = "directional"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR < 0.05 | FDR.Mixed < 0.05,
p_bonf = p.adjust(PValue, "bonf"))
}, simplify = FALSE)
fryRes_RUVk1_hm %>%
bind_rows() %>%
bind_rows(fryRes_RUVk1_kg %>% bind_rows()) %>%
arrange(PValue.Mixed) %>%
head(20) %>%
kable(caption = "Most significant gene sets using fry (RUV data)") %>%
kable_styling("basic")
| pathway | NGenes | Direction | PValue | FDR | PValue.Mixed | FDR.Mixed | coef | sig | p_bonf |
|---|---|---|---|---|---|---|---|---|---|
| KEGG_OLFACTORY_TRANSDUCTION | 23 | Down | 0.4790467 | 0.9240815 | 0.0039789 | 0.6963102 | R122Pfs/+ | FALSE | 1 |
| KEGG_DILATED_CARDIOMYOPATHY | 87 | Down | 0.0910712 | 0.3622149 | 0.0048393 | 0.3301267 | trans | FALSE | 1 |
| KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION | 44 | Up | 0.0977542 | 0.8288377 | 0.0077226 | 0.5281057 | V1482Afs/+ | FALSE | 1 |
| KEGG_HEMATOPOIETIC_CELL_LINEAGE | 29 | Up | 0.1859407 | 0.8288377 | 0.0085066 | 0.5281057 | V1482Afs/+ | FALSE | 1 |
| HALLMARK_IL2_STAT5_SIGNALING | 170 | Down | 0.0232968 | 0.2334531 | 0.0124745 | 0.3149356 | trans | FALSE | 1 |
| KEGG_VIRAL_MYOCARDITIS | 40 | Down | 0.0067184 | 0.2575902 | 0.0128170 | 0.3301267 | trans | FALSE | 1 |
| KEGG_GLYCOLYSIS_GLUCONEOGENESIS | 57 | Down | 0.0126542 | 0.7381629 | 0.0131621 | 0.5281057 | V1482Afs/+ | FALSE | 1 |
| KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS | 28 | Down | 0.9275119 | 0.9717188 | 0.0131871 | 0.5281057 | V1482Afs/+ | FALSE | 1 |
| KEGG_SELENOAMINO_ACID_METABOLISM | 26 | Down | 0.2247719 | 0.5292857 | 0.0143613 | 0.3301267 | trans | FALSE | 1 |
| HALLMARK_MYC_TARGETS_V2 | 60 | Down | 0.9064259 | 0.9185165 | 0.0145385 | 0.3149356 | trans | FALSE | 1 |
| KEGG_PPAR_SIGNALING_PATHWAY | 61 | Up | 0.1312984 | 0.8288377 | 0.0167056 | 0.5281057 | V1482Afs/+ | FALSE | 1 |
| KEGG_HYPERTROPHIC_CARDIOMYOPATHY_HCM | 79 | Down | 0.1578463 | 0.4281959 | 0.0167987 | 0.3301267 | trans | FALSE | 1 |
| KEGG_FOCAL_ADHESION | 199 | Down | 0.0088317 | 0.2575902 | 0.0182385 | 0.3301267 | trans | FALSE | 1 |
| KEGG_PURINE_METABOLISM | 140 | Up | 0.9476591 | 0.9698266 | 0.0191120 | 0.3301267 | trans | FALSE | 1 |
| KEGG_GLYCOSPHINGOLIPID_BIOSYNTHESIS_GANGLIO_SERIES | 16 | Up | 0.1634845 | 0.9240815 | 0.0192433 | 0.8493961 | R122Pfs/+ | FALSE | 1 |
| KEGG_PRION_DISEASES | 25 | Down | 0.0068573 | 0.2575902 | 0.0211141 | 0.3301267 | trans | FALSE | 1 |
| HALLMARK_TNFA_SIGNALING_VIA_NFKB | 164 | Down | 0.0420216 | 0.2334531 | 0.0211943 | 0.3149356 | trans | FALSE | 1 |
| KEGG_RETINOL_METABOLISM | 29 | Up | 0.1875437 | 0.8288377 | 0.0263639 | 0.5281057 | V1482Afs/+ | FALSE | 1 |
| KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM | 41 | Down | 0.1479551 | 0.8288377 | 0.0293704 | 0.5281057 | V1482Afs/+ | FALSE | 1 |
| KEGG_GLYCINE_SERINE_AND_THREONINE_METABOLISM | 25 | Up | 0.1978493 | 0.8288377 | 0.0296937 | 0.5281057 | V1482Afs/+ | FALSE | 1 |
Still no significant pathways are observed. So I will try the “ensemble” method of GSEA, by performing 3 different methods of GSEA and combining raw p values by calculating their harmonic mean.
CAMERA (Correlation Adjusted MEan RAnk) is a competitive gene set test which accounts for inter-gene correlation. It tests whether the genes in the set are highly ranked in terms of differential expression relative to genes not in the set.
No gene sets were signifcantly altered in this gene set test. Some of the top gene sets are shown below.
camera_res_KEGG_RUVk1 <-
design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
geneDGE %>%
cpm(log = TRUE) %>%
camera(
index = KEGG,
design = design_RUVk1,
contrast = x,
inter.gene.cor = NA,
sort = TRUE
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR < 0.05)
}, simplify = FALSE)
camera_res_hm_RUVk1 <-
design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
geneDGE %>%
cpm(log = TRUE) %>%
camera(
index = hallmark,
design = design_RUVk1,
contrast = x,
inter.gene.cor = NA,
sort = TRUE
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR < 0.05)
}, simplify = FALSE)
camera_res_KEGG_RUVk1 %>%
bind_rows() %>%
bind_rows(camera_res_hm_RUVk1 %>% bind_rows()) %>%
arrange(PValue) %>%
head(10) %>%
kable(caption = "Top 10 DE pathways using the camera algorithm") %>%
kable_styling()
| pathway | NGenes | Correlation | Direction | PValue | FDR | coef | sig |
|---|---|---|---|---|---|---|---|
| KEGG_STARCH_AND_SUCROSE_METABOLISM | 29 | -0.0046737 | Down | 0.0009368 | 0.1639340 | R122Pfs/+ | FALSE |
| KEGG_PYRUVATE_METABOLISM | 38 | 0.0427810 | Down | 0.0052644 | 0.4606321 | R122Pfs/+ | FALSE |
| HALLMARK_MTORC1_SIGNALING | 210 | 0.0210840 | Down | 0.0063255 | 0.3162751 | R122Pfs/+ | FALSE |
| KEGG_CELL_ADHESION_MOLECULES_CAMS | 91 | 0.0015989 | Up | 0.0066042 | 0.9069268 | V1482Afs/+ | FALSE |
| KEGG_PRIMARY_BILE_ACID_BIOSYNTHESIS | 18 | 0.0224078 | Down | 0.0152056 | 0.7133472 | R122Pfs/+ | FALSE |
| KEGG_GLYCOLYSIS_GLUCONEOGENESIS | 57 | 0.0189784 | Down | 0.0165228 | 0.7133472 | R122Pfs/+ | FALSE |
| KEGG_MELANOMA | 62 | 0.0149220 | Down | 0.0171354 | 0.8497136 | trans | FALSE |
| KEGG_GLYCOLYSIS_GLUCONEOGENESIS | 57 | 0.0189784 | Down | 0.0193182 | 0.9069268 | V1482Afs/+ | FALSE |
| KEGG_VALINE_LEUCINE_AND_ISOLEUCINE_BIOSYNTHESIS | 12 | 0.0207579 | Down | 0.0199945 | 0.9069268 | V1482Afs/+ | FALSE |
| KEGG_RNA_POLYMERASE | 25 | 0.0697358 | Up | 0.0205439 | 0.8497136 | trans | FALSE |
Finally I will look at fgsea, This is the fast implementation of GSEA. All genes are ranked by differential expression (sign(logFC) x log10(PValue)), resulting in a ranked list starting from the most upregulated genes, ending in the most downregulated genes, and unchanged genes in the middle. The fgsea uses a Kolmogorov–Smirnov-like statistc called an enrichment score (ES), that represents the amount to which genes in a predefined geneset are overrepresented at either the start or end of the ranked list. fgsea then estimates the statistical significance of the ES. This calculation is done by a gene permutations in order to produce a null distribution for the ES. The P value is determined by comparison to the null distribution. Note that fgsea is quite naive in that it does not account for inter-gene correlation, therefore it will likely show false positives due to the nature of the permutations it performs.
# create a rank stat for fgsea
ranks_fgseaRUVk1 <-
sapply(topTables_RUVk1, function(x) {
x %>%
mutate(PValue_withsign = sign(logFC) * log10(1/PValue)) %>%
arrange(PValue_withsign) %>%
dplyr::select(c("gene_id", "PValue_withsign")) %>% #only want the Pvalue with sign
with(structure(PValue_withsign, names = gene_id)) %>%
rev() # reverse so the start of the list is upregulated genes
}, simplify = FALSE)
# save ranks forsimulation
# ranks_fgseaRUVk1 %>% saveRDS("R_objects/ranksRUVSeq.rds")
# Run fgsea
# This takes a while due to the number of permutations
# set a seed for a reproducible result
set.seed(33)
fgseaRes_RUVk1_kg <- ranks_fgseaRUVk1 %>%
lapply(function(x){
fgseaMultilevel(stats = x, pathways = KEGG) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::select(pathway, pval, FDR, padj, everything()) %>%
arrange(pval) %>%
mutate(sig = padj < 0.05)
})
fgseaRes_RUVk1_kg$`R122Pfs/+` %<>%
mutate(coef = "R122Pfs/+")
fgseaRes_RUVk1_kg$trans %<>%
mutate(coef = "trans")
fgseaRes_RUVk1_kg$`V1482Afs/+` %<>%
mutate(coef = "V1482Afs/+")
# Run fgsea
# This takes a while due to the number of permutations
set.seed(33)
fgseaRes_RUVk1_hm <- ranks_fgseaRUVk1 %>%
lapply(function(x){
fgseaMultilevel(stats = x, pathways = hallmark) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::select(pathway, pval, FDR, padj, everything()) %>%
arrange(pval) %>%
mutate(sig = padj < 0.05)
})
fgseaRes_RUVk1_hm$`R122Pfs/+` %<>%
mutate(coef = "R122Pfs/+")
fgseaRes_RUVk1_hm$trans %<>%
mutate(coef = "trans")
fgseaRes_RUVk1_hm$`V1482Afs/+` %<>%
mutate(coef = "V1482Afs/+")
fgseaRes_RUVk1_hm %>%
bind_rows() %>%
bind_rows(fgseaRes_RUVk1_kg %>%
bind_rows()) %>%
dplyr::filter(sig == T) %>%
dplyr::select(-leadingEdge) %>%
kable(caption = "Significant KEGG & HALLMARK gene sets using the fgsea algorithm. Gene sets were considered signiicant if they had a Bonferroni-adjusted p value of < 0.05.") %>%
kable_styling()
| pathway | pval | FDR | padj | log2err | ES | NES | size | sig | coef |
|---|---|---|---|---|---|---|---|---|---|
| HALLMARK_OXIDATIVE_PHOSPHORYLATION | 0.0000000 | 0.0000002 | 0.0000002 | 0.7614608 | -0.5166880 | -2.163813 | 225 | TRUE | R122Pfs/+ |
| HALLMARK_MTORC1_SIGNALING | 0.0000002 | 0.0000039 | 0.0000078 | 0.6901325 | -0.4848398 | -2.012731 | 210 | TRUE | R122Pfs/+ |
| HALLMARK_IL2_STAT5_SIGNALING | 0.0000121 | 0.0002024 | 0.0006072 | 0.5933255 | -0.4767654 | -1.915894 | 170 | TRUE | R122Pfs/+ |
| HALLMARK_ADIPOGENESIS | 0.0000206 | 0.0002575 | 0.0010299 | 0.5756103 | -0.4400068 | -1.816515 | 204 | TRUE | R122Pfs/+ |
| HALLMARK_FATTY_ACID_METABOLISM | 0.0000298 | 0.0002982 | 0.0014909 | 0.5756103 | -0.4716552 | -1.873050 | 158 | TRUE | R122Pfs/+ |
| HALLMARK_TNFA_SIGNALING_VIA_NFKB | 0.0000102 | 0.0005083 | 0.0005083 | 0.5933255 | -0.4864199 | -1.795507 | 164 | TRUE | trans |
| HALLMARK_IL2_STAT5_SIGNALING | 0.0000470 | 0.0011747 | 0.0023494 | 0.5573322 | -0.4599824 | -1.704567 | 170 | TRUE | trans |
| HALLMARK_HEME_METABOLISM | 0.0002376 | 0.0039607 | 0.0118820 | 0.5188481 | -0.4327065 | -1.608856 | 179 | TRUE | trans |
| HALLMARK_PI3K_AKT_MTOR_SIGNALING | 0.0005738 | 0.0071719 | 0.0286876 | 0.4772708 | -0.4758469 | -1.662581 | 104 | TRUE | trans |
| HALLMARK_HYPOXIA | 0.0009358 | 0.0093582 | 0.0467910 | 0.4772708 | -0.4072651 | -1.542389 | 208 | TRUE | trans |
| HALLMARK_OXIDATIVE_PHOSPHORYLATION | 0.0000000 | 0.0000000 | 0.0000000 | NA | -0.5293729 | -2.296493 | 225 | TRUE | V1482Afs/+ |
| HALLMARK_MYC_TARGETS_V1 | 0.0000154 | 0.0003853 | 0.0007707 | 0.5756103 | -0.3968557 | -1.723529 | 219 | TRUE | V1482Afs/+ |
| HALLMARK_ALLOGRAFT_REJECTION | 0.0005121 | 0.0069740 | 0.0256044 | 0.4772708 | 0.4389593 | 1.670693 | 112 | TRUE | V1482Afs/+ |
| HALLMARK_MTORC1_SIGNALING | 0.0006946 | 0.0069740 | 0.0347296 | 0.4772708 | -0.3497078 | -1.514592 | 210 | TRUE | V1482Afs/+ |
| HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION | 0.0006974 | 0.0069740 | 0.0348700 | 0.4772708 | 0.3773206 | 1.557518 | 190 | TRUE | V1482Afs/+ |
| KEGG_OXIDATIVE_PHOSPHORYLATION | 0.0000012 | 0.0002126 | 0.0002126 | 0.6435518 | -0.5487070 | -2.102000 | 120 | TRUE | R122Pfs/+ |
| KEGG_RIBOSOME | 0.0000092 | 0.0008025 | 0.0016050 | 0.5933255 | -0.6089914 | -2.151997 | 71 | TRUE | R122Pfs/+ |
| KEGG_PYRUVATE_METABOLISM | 0.0000151 | 0.0008823 | 0.0026470 | 0.5933255 | -0.7106757 | -2.246468 | 38 | TRUE | R122Pfs/+ |
| KEGG_PARKINSONS_DISEASE | 0.0000663 | 0.0029003 | 0.0116010 | 0.5384341 | -0.5098636 | -1.954922 | 123 | TRUE | R122Pfs/+ |
| KEGG_OXIDATIVE_PHOSPHORYLATION | 0.0000010 | 0.0001791 | 0.0001791 | 0.6435518 | 0.5218836 | 2.084416 | 120 | TRUE | trans |
| KEGG_SPLICEOSOME | 0.0001093 | 0.0082282 | 0.0191352 | 0.5384341 | 0.4346899 | 1.757224 | 130 | TRUE | trans |
| KEGG_MELANOMA | 0.0001411 | 0.0082282 | 0.0246847 | 0.5188481 | -0.5654522 | -1.847517 | 62 | TRUE | trans |
| KEGG_PYRIMIDINE_METABOLISM | 0.0002564 | 0.0100465 | 0.0448727 | 0.4984931 | 0.4750257 | 1.791782 | 80 | TRUE | trans |
| KEGG_OXIDATIVE_PHOSPHORYLATION | 0.0000000 | 0.0000000 | 0.0000000 | NA | -0.6013872 | -2.419244 | 120 | TRUE | V1482Afs/+ |
| KEGG_RIBOSOME | 0.0000000 | 0.0000030 | 0.0000059 | 0.7195128 | 0.6248822 | 2.258843 | 71 | TRUE | V1482Afs/+ |
| KEGG_PARKINSONS_DISEASE | 0.0000007 | 0.0000420 | 0.0001261 | 0.6594444 | -0.5033531 | -2.018534 | 123 | TRUE | V1482Afs/+ |
| KEGG_CITRATE_CYCLE_TCA_CYCLE | 0.0000022 | 0.0000955 | 0.0003821 | 0.6272567 | -0.7102651 | -2.302941 | 34 | TRUE | V1482Afs/+ |
| KEGG_PROTEASOME | 0.0000077 | 0.0002678 | 0.0013392 | 0.5933255 | -0.6334235 | -2.179186 | 47 | TRUE | V1482Afs/+ |
| KEGG_ALZHEIMERS_DISEASE | 0.0000349 | 0.0010170 | 0.0061022 | 0.5573322 | -0.4281906 | -1.772509 | 151 | TRUE | V1482Afs/+ |
Significant gene sets were only observed in the transhet for fry (probably the most correct method with the best power). Some siginicant gene sets were observed for the V1482Afs and null samples in the fgsea algorithm after bonferroni adjustment (which is a very strict threshold), so there may be something there.
Next, i will perform a meta analysis of the GSEAs by calculating the harmonic mean p value. This method corrects for the strong-sense family-wise error rate, and states whether groups of dependent p values are statistically significant.
harmonicPvals_hm <-
fryRes_RUVk1_hm %>%
bind_rows() %>%
dplyr::select(pathway, PValue.Mixed, coef) %>%
dplyr::rename(fry_p = PValue.Mixed) %>%
left_join(camera_res_hm_RUVk1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(camera_p = PValue) %>%
left_join(fgseaRes_RUVk1_hm %>%
bind_rows() %>%
dplyr::select(pathway, pval, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(fgsea_p = pval) %>%
bind_rows() %>%
nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>%
mutate(harmonic_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
p.hmp(x, L = 4)
}, numeric(1)),
wilkinson_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
wilkinsonp(x)$p
}, numeric(1)),
wilkinson_p_2 = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
wilkinsonp(x, r = 2)$p
}, numeric(1)),
wilkinson_p_3 = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
wilkinsonp(x, r = 3)$p
}, numeric(1))
) %>%
unnest() %>%
mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
wilk_p_FDR = p.adjust(wilkinson_p, "fdr"),
wilk_p_2_FDR = p.adjust(wilkinson_p_2, "fdr"),
wilk_p_3_FDR = p.adjust(wilkinson_p_3, "fdr"),
sig = harmonic_p_FDR < 0.05) %>%
arrange(wilkinson_p_2)
harmonicPvals_kg <-
fryRes_RUVk1_kg%>%
bind_rows() %>%
dplyr::select(pathway, PValue.Mixed, coef) %>%
dplyr::rename(fry_p = PValue.Mixed) %>%
left_join(camera_res_KEGG_RUVk1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(camera_p = PValue) %>%
left_join(fgseaRes_RUVk1_kg %>%
bind_rows() %>%
dplyr::select(pathway, pval, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(fgsea_p = pval) %>%
bind_rows() %>%
nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>%
mutate(harmonic_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
p.hmp(x, L = 4)
}, numeric(1)),
wilkinson_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
wilkinsonp(x)$p
}, numeric(1)),
wilkinson_p_2 = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
wilkinsonp(x, r = 2)$p
}, numeric(1)),
wilkinson_p_3 = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
wilkinsonp(x, r = 3)$p
}, numeric(1))
) %>%
unnest() %>%
mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
wilk_p_FDR = p.adjust(wilkinson_p, "fdr"),
wilk_p_2_FDR = p.adjust(wilkinson_p_2, "fdr"),
wilk_p_3_FDR = p.adjust(wilkinson_p_3, "fdr"),
sig = harmonic_p_FDR < 0.05) %>%
arrange(wilkinson_p_2)
harmonicPvals_hm %>%
bind_rows(harmonicPvals_kg) %>%
dplyr::filter(sig == TRUE & coef == "V1482Afs/+") %>%
kable(caption = "Significant gene sets in V1482Afs/+ brains (harmonic mean p FDR adjusted < 0.05)") %>%
kable_styling()
| pathway | coef | fry_p | camera_p | fgsea_p | harmonic_p | wilkinson_p | wilkinson_p_2 | wilkinson_p_3 | harmonic_p_FDR | wilk_p_FDR | wilk_p_2_FDR | wilk_p_3_FDR | sig |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| HALLMARK_MTORC1_SIGNALING | V1482Afs/+ | 0.0427354 | 0.1052653 | 0.0006946 | 0.0020782 | 0.0020823 | 0.0053229 | 0.0011664 | 0.0227777 | 0.0224008 | 0.1144951 | 0.0510597 | TRUE |
| HALLMARK_APICAL_JUNCTION | V1482Afs/+ | 0.1624615 | 0.0488034 | 0.0016904 | 0.0050611 | 0.0050626 | 0.0069128 | 0.0042880 | 0.0446570 | 0.0446703 | 0.1152142 | 0.0558904 | TRUE |
| HALLMARK_OXIDATIVE_PHOSPHORYLATION | V1482Afs/+ | 0.1115355 | 0.1691985 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0345455 | 0.0048438 | 0.0000000 | 0.0000000 | 0.1851435 | 0.0558904 | TRUE |
| HALLMARK_ALLOGRAFT_REJECTION | V1482Afs/+ | 0.1148330 | 0.1169782 | 0.0005121 | 0.0015466 | 0.0015355 | 0.0365313 | 0.0016007 | 0.0210895 | 0.0209383 | 0.1851435 | 0.0510597 | TRUE |
| HALLMARK_MYC_TARGETS_V1 | V1482Afs/+ | 0.5039341 | 0.2243305 | 0.0000154 | 0.0000463 | 0.0000462 | 0.1283940 | 0.1279738 | 0.0011567 | 0.0011560 | 0.3106307 | 0.2841949 | TRUE |
| HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION | V1482Afs/+ | 0.3526774 | 0.2939392 | 0.0006974 | 0.0021259 | 0.0020907 | 0.2084079 | 0.0438665 | 0.0227777 | 0.0224008 | 0.4083261 | 0.1848843 | TRUE |
| KEGG_CELL_ADHESION_MOLECULES_CAMS | V1482Afs/+ | 0.1061212 | 0.0066042 | 0.0003867 | 0.0011048 | 0.0011595 | 0.0001303 | 0.0011951 | 0.0341179 | 0.0351487 | 0.0204800 | 0.0330227 | TRUE |
| KEGG_GLYCOLYSIS_GLUCONEOGENESIS | V1482Afs/+ | 0.0131621 | 0.0193182 | 0.0005370 | 0.0015308 | 0.0016100 | 0.0005152 | 0.0000072 | 0.0414823 | 0.0419055 | 0.0450765 | 0.0037849 | TRUE |
| KEGG_CITRATE_CYCLE_TCA_CYCLE | V1482Afs/+ | 0.0322982 | 0.0470769 | 0.0000022 | 0.0000065 | 0.0000065 | 0.0030621 | 0.0001043 | 0.0005731 | 0.0005731 | 0.0937173 | 0.0152851 | TRUE |
| KEGG_ADHERENS_JUNCTION | V1482Afs/+ | 0.0784379 | 0.0468924 | 0.0005278 | 0.0015803 | 0.0015827 | 0.0063905 | 0.0004826 | 0.0414823 | 0.0419055 | 0.1266652 | 0.0211133 | TRUE |
| KEGG_PROTEASOME | V1482Afs/+ | 0.3299721 | 0.1723537 | 0.0000077 | 0.0000230 | 0.0000230 | 0.0788776 | 0.0359279 | 0.0017224 | 0.0017218 | 0.3022680 | 0.1598486 | TRUE |
| KEGG_OXIDATIVE_PHOSPHORYLATION | V1482Afs/+ | 0.2430565 | 0.2458999 | 0.0000000 | 0.0000000 | 0.0000000 | 0.1485116 | 0.0148688 | 0.0000002 | 0.0000002 | 0.4277462 | 0.1075328 | TRUE |
| KEGG_ALZHEIMERS_DISEASE | V1482Afs/+ | 0.3230226 | 0.2860488 | 0.0000349 | 0.0001047 | 0.0001046 | 0.1986604 | 0.0337053 | 0.0054986 | 0.0054918 | 0.4919657 | 0.1538722 | TRUE |
| KEGG_PARKINSONS_DISEASE | V1482Afs/+ | 0.3299820 | 0.3368124 | 0.0000007 | 0.0000022 | 0.0000022 | 0.2548021 | 0.0382089 | 0.0003782 | 0.0003782 | 0.5560050 | 0.1644273 | TRUE |
| KEGG_RIBOSOME | V1482Afs/+ | 0.4771629 | 0.4687473 | 0.0000000 | 0.0000001 | 0.0000001 | 0.4531820 | 0.1086425 | 0.0000266 | 0.0000266 | 0.7556810 | 0.2910068 | TRUE |
harmonicPvals_hm %>%
bind_rows(harmonicPvals_kg) %>%
dplyr::filter(sig == TRUE & coef == "R122Pfs/+") %>%
kable(caption = "Significant gene sets in null/+ brains (harmonic mean p FDR adjusted < 0.05)") %>%
kable_styling()
| pathway | coef | fry_p | camera_p | fgsea_p | harmonic_p | wilkinson_p | wilkinson_p_2 | wilkinson_p_3 | harmonic_p_FDR | wilk_p_FDR | wilk_p_2_FDR | wilk_p_3_FDR | sig |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| HALLMARK_MTORC1_SIGNALING | R122Pfs/+ | 0.2661002 | 0.0063255 | 0.0000002 | 0.0000005 | 0.0000005 | 0.0001195 | 0.0188424 | 0.0000233 | 0.0000233 | 0.0179295 | 0.1284708 | TRUE |
| HALLMARK_IL2_STAT5_SIGNALING | R122Pfs/+ | 0.1338565 | 0.1326548 | 0.0000121 | 0.0000364 | 0.0000364 | 0.0481231 | 0.0023984 | 0.0010934 | 0.0010930 | 0.1950939 | 0.0510597 | TRUE |
| HALLMARK_FATTY_ACID_METABOLISM | R122Pfs/+ | 0.1683572 | 0.1419083 | 0.0000298 | 0.0000895 | 0.0000895 | 0.0546984 | 0.0047719 | 0.0016787 | 0.0016772 | 0.2016982 | 0.0558904 | TRUE |
| HALLMARK_ADIPOGENESIS | R122Pfs/+ | 0.4622099 | 0.2215037 | 0.0000206 | 0.0000618 | 0.0000618 | 0.1254560 | 0.0987456 | 0.0013251 | 0.0013241 | 0.3084984 | 0.2553765 | TRUE |
| HALLMARK_XENOBIOTIC_METABOLISM | R122Pfs/+ | 0.3596106 | 0.2452983 | 0.0013260 | 0.0040835 | 0.0039727 | 0.1509940 | 0.0465048 | 0.0382830 | 0.0372445 | 0.3484477 | 0.1885328 | TRUE |
| HALLMARK_OXIDATIVE_PHOSPHORYLATION | R122Pfs/+ | 0.5380352 | 0.2534905 | 0.0000000 | 0.0000000 | 0.0000000 | 0.1601950 | 0.1557514 | 0.0000011 | 0.0000011 | 0.3639867 | 0.3066466 | TRUE |
| KEGG_PYRUVATE_METABOLISM | R122Pfs/+ | 0.0614810 | 0.0052644 | 0.0000151 | 0.0000453 | 0.0000454 | 0.0000828 | 0.0002324 | 0.0026405 | 0.0026469 | 0.0204800 | 0.0152851 | TRUE |
| KEGG_OXIDATIVE_PHOSPHORYLATION | R122Pfs/+ | 0.5945218 | 0.3316675 | 0.0000012 | 0.0000036 | 0.0000036 | 0.2570409 | 0.2101374 | 0.0003827 | 0.0003827 | 0.5560050 | 0.4178869 | TRUE |
| KEGG_PARKINSONS_DISEASE | R122Pfs/+ | 0.6136803 | 0.3546357 | 0.0000663 | 0.0001993 | 0.0001989 | 0.2880969 | 0.2311142 | 0.0095132 | 0.0094911 | 0.5954759 | 0.4428283 | TRUE |
| KEGG_RIBOSOME | R122Pfs/+ | 0.5016710 | 0.4576009 | 0.0000092 | 0.0000275 | 0.0000275 | 0.4365538 | 0.1262574 | 0.0018063 | 0.0018056 | 0.7441258 | 0.3099307 | TRUE |
harmonicPvals_hm %>%
bind_rows(harmonicPvals_kg) %>%
dplyr::filter(sig == TRUE & coef == "trans") %>%
kable(caption = "Significant gene sets in trans brains (harmonic mean p FDR adjusted < 0.05)") %>%
kable_styling()
| pathway | coef | fry_p | camera_p | fgsea_p | harmonic_p | wilkinson_p | wilkinson_p_2 | wilkinson_p_3 | harmonic_p_FDR | wilk_p_FDR | wilk_p_2_FDR | wilk_p_3_FDR | sig |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| HALLMARK_IL2_STAT5_SIGNALING | trans | 0.0124745 | 0.0907546 | 0.0000470 | 0.0001406 | 0.0001410 | 0.0004630 | 0.0007475 | 0.0023438 | 0.0023493 | 0.0347218 | 0.0510597 | TRUE |
| HALLMARK_TNFA_SIGNALING_VIA_NFKB | trans | 0.0211943 | 0.1318845 | 0.0000102 | 0.0000305 | 0.0000305 | 0.0013286 | 0.0022939 | 0.0010934 | 0.0010930 | 0.0664276 | 0.0510597 | TRUE |
| HALLMARK_PI3K_AKT_MTOR_SIGNALING | trans | 0.0658323 | 0.1476994 | 0.0005738 | 0.0017290 | 0.0017203 | 0.0124310 | 0.0032221 | 0.0216130 | 0.0215033 | 0.1245680 | 0.0532970 | TRUE |
| HALLMARK_HYPOXIA | trans | 0.0720119 | 0.1240488 | 0.0009358 | 0.0028230 | 0.0028048 | 0.0148103 | 0.0019089 | 0.0282301 | 0.0280484 | 0.1306787 | 0.0510597 | TRUE |
| HALLMARK_HEME_METABOLISM | trans | 0.0742018 | 0.1396451 | 0.0002376 | 0.0007151 | 0.0007128 | 0.0157006 | 0.0027232 | 0.0107262 | 0.0106913 | 0.1308385 | 0.0510597 | TRUE |
| KEGG_MELANOMA | trans | 0.0988569 | 0.0171354 | 0.0001411 | 0.0004212 | 0.0004231 | 0.0008708 | 0.0009661 | 0.0170102 | 0.0170870 | 0.0507966 | 0.0330227 | TRUE |
| KEGG_INSULIN_SIGNALING_PATHWAY | trans | 0.0518222 | 0.2926678 | 0.0003398 | 0.0010225 | 0.0010190 | 0.0077783 | 0.0250683 | 0.0341179 | 0.0341479 | 0.1266652 | 0.1361045 | TRUE |
| KEGG_PANCREATIC_CANCER | trans | 0.0582801 | 0.1045314 | 0.0006521 | 0.0019595 | 0.0019550 | 0.0097938 | 0.0011422 | 0.0439394 | 0.0433922 | 0.1380552 | 0.0330227 | TRUE |
| KEGG_MAPK_SIGNALING_PATHWAY | trans | 0.0763288 | 0.3930536 | 0.0006070 | 0.0018366 | 0.0018199 | 0.0165889 | 0.0607233 | 0.0438274 | 0.0433922 | 0.1527451 | 0.2189536 | TRUE |
| KEGG_REGULATION_OF_ACTIN_CYTOSKELETON | trans | 0.1054149 | 0.0771078 | 0.0003470 | 0.0010445 | 0.0010407 | 0.0169199 | 0.0011714 | 0.0341179 | 0.0341479 | 0.1527451 | 0.0330227 | TRUE |
| KEGG_SPLICEOSOME | trans | 0.1219252 | 0.0948070 | 0.0001093 | 0.0003287 | 0.0003280 | 0.0252608 | 0.0018125 | 0.0143793 | 0.0143499 | 0.1816702 | 0.0432531 | TRUE |
| KEGG_PYRIMIDINE_METABOLISM | trans | 0.1238372 | 0.1490330 | 0.0002564 | 0.0007729 | 0.0007690 | 0.0422087 | 0.0033101 | 0.0289824 | 0.0288393 | 0.2475084 | 0.0599250 | TRUE |
| KEGG_ENDOCYTOSIS | trans | 0.1385912 | 0.2405890 | 0.0006617 | 0.0020087 | 0.0019836 | 0.0522986 | 0.0139260 | 0.0439394 | 0.0433922 | 0.2590261 | 0.1029741 | TRUE |
| KEGG_MTOR_SIGNALING_PATHWAY | trans | 0.1716417 | 0.2378487 | 0.0004019 | 0.0012159 | 0.0012051 | 0.0782692 | 0.0134556 | 0.0354650 | 0.0351487 | 0.3021422 | 0.1009168 | TRUE |
| KEGG_OXIDATIVE_PHOSPHORYLATION | trans | 0.5334366 | 0.3863112 | 0.0000010 | 0.0000031 | 0.0000031 | 0.3324057 | 0.1517918 | 0.0003827 | 0.0003827 | 0.6322934 | 0.3391094 | TRUE |
| KEGG_HUNTINGTONS_DISEASE | trans | 0.5234449 | 0.4240937 | 0.0005591 | 0.0017015 | 0.0016762 | 0.3870153 | 0.1434210 | 0.0425379 | 0.0419055 | 0.6911793 | 0.3346490 | TRUE |
sigpaths <-
harmonicPvals_hm %>%
bind_rows(harmonicPvals_kg %>% bind_rows()) %>%
dplyr::filter(wilk_p_FDR < 0.05) %>%
.$pathway
harmonicPvals_hm %>%
bind_rows(harmonicPvals_kg %>% bind_rows()) %>%
dplyr::filter(pathway %in% sigpaths) %>%
ggplot(aes(coef, pathway, size =-log10(wilkinson_p))) +
geom_point(aes(colour = sig)) +
geom_text(
aes(label = harmonic_p_FDR %>% signif(1)),
size = 2.5,
vjust = 2.5
) +
labs(colour = "harmonic mean p < 0.05 \nafter FDR adjustment")+
ggpubr::theme_pubclean()+
theme(legend.position = "right")+
easy_rotate_labels(which = "x", angle =315) +
scale_color_manual(values = c("grey30", "turquoise3"))
# ggsave("plots/dotplot_RUVk1_HMandKG_harmonicP.png", width = 20, height = 30, units = "cm", dpi = 300)
sig_in <- harmonicPvals_hm %>%
bind_rows(harmonicPvals_kg %>% bind_rows()) %>%
dplyr::filter(pathway %in% sigpaths) %>%
dplyr::select(pathway, harmonic_p_FDR, coef) %>%
spread(key = "coef", value = "harmonic_p_FDR") %>%
mutate(sig_in = case_when(
`R122Pfs/+` < 0.05 & trans < 0.05 & `V1482Afs/+` < 0.05 ~ "all",
`R122Pfs/+` > 0.05 & trans < 0.05 & `V1482Afs/+` < 0.05 ~ "trans&EOfAD",
`R122Pfs/+` < 0.05 & trans > 0.05 & `V1482Afs/+` < 0.05 ~ "null&EOfAD",
`R122Pfs/+` < 0.05 & trans < 0.05 & `V1482Afs/+` > 0.05 ~ "null&trans",
`R122Pfs/+` < 0.05 & trans > 0.05 & `V1482Afs/+` > 0.05 ~ "null only",
`R122Pfs/+` > 0.05 & trans < 0.05 & `V1482Afs/+` > 0.05 ~ "transOnly",
`R122Pfs/+` > 0.05 & trans > 0.05 & `V1482Afs/+` < 0.05 ~ "eofAD only",
)) %>%
dplyr::select(pathway, sig_in)
harmonicPvals_hm %>%
bind_rows(harmonicPvals_kg %>% bind_rows()) %>%
dplyr::filter(pathway %in% sigpaths) %>%
left_join(sig_in) %>%
ggplot(aes(coef, pathway, size =-log10(harmonic_p))) +
geom_point(aes(colour = sig)) +
geom_text(
aes(label = harmonic_p_FDR %>% signif(1)),
size = 2.5,
vjust = 2.5
) +
labs(colour = "harmonic mean p < 0.05 \nafter FDR adjustment")+
ggpubr::theme_pubclean()+
theme(legend.position = "right")+
easy_rotate_labels(which = "x", angle =315) +
facet_wrap(~sig_in, scales = "free", strip.position = "right") +
scale_color_manual(values = c("grey30", "turquoise3"))
# ggsave("plots/dotplotbySignifin.png", width = 40, units = "cm")
Nhi Hin, a PhD candidate in our lab, recently developed a method for testing for iron dyshomeostasis in RNA-seq data. (see https://www.biorxiv.org/content/10.1101/2020.05.01.071498v2 for the preprint article). I will use a modified version of this analysis, where i will use the RUV adjusted counts as input rather than a limma voom object as input for the GSEAs on the IRE gene sets. Also, rather than combining the raw p-values using Wilkinsons method (which assumes that each p value from each of the methods of GSEA are independent), i will use the harmonic mean p method, which does not have this assumption.
fry, camera, and fgsea are the GSEA algorithms,
zebrafishIreGenes <- readRDS("gene_sets/zebrafishIreGenes.rds")
Perform the individual tests
fry_ire_RUVk1 <-
design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
geneDGE %>%
cpm(log = T) %>%
fry(
index = zebrafishIreGenes,
design = design_RUVk1,
contrast = x,
sort = "directional"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR < 0.05)
}, simplify = FALSE)
camera_ire_RUV <-
design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
geneDGE %>%
cpm(log = T) %>%
camera(
index = zebrafishIreGenes,
design = design_RUVk1,
contrast = x,
inter.gene.cor = NA,
sort = TRUE
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
sig = FDR < 0.05)
}, simplify = FALSE)
# Run fgsea
fgseaRes_ire_RUV <- ranks_fgseaRUVk1 %>%
lapply(function(x){
fgseaMultilevel(stats = x,
pathways = zebrafishIreGenes) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::select(pathway, pval, FDR, padj, everything()) %>%
arrange(pval) %>%
mutate(sig = padj < 0.05)
})
fgseaRes_ire_RUV$`R122Pfs/+` %<>%
mutate(coef = "R122Pfs/+")
fgseaRes_ire_RUV$trans %<>%
mutate(coef = "trans")
fgseaRes_ire_RUV$`V1482Afs/+` %<>%
mutate(coef = "V1482Afs/+")
harmonic_p_ire_RUV <-
fry_ire_RUVk1 %>%
bind_rows() %>%
dplyr::select(pathway, PValue.Mixed, coef) %>%
dplyr::rename(fry_p = PValue.Mixed) %>%
left_join(camera_ire_RUV %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(camera_p = PValue) %>%
left_join(fgseaRes_ire_RUV %>%
bind_rows() %>%
dplyr::select(pathway, pval, coef),
by = c("pathway", "coef")
) %>%
dplyr::rename(fgsea_p = pval) %>%
bind_rows() %>%
nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>%
mutate(harmonic_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
p.hmp(x, L = 4)
}, numeric(1)),
wilkinson_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
wilkinsonp(x)$p
}, numeric(1))
) %>%
unnest() %>%
mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
wilk_p_FDR = p.adjust(wilkinson_p, "fdr"),
sig = harmonic_p_FDR < 0.05) %>%
arrange(harmonic_p_FDR)
harmonic_p_ire_RUV %>%
mutate(
pathway = case_when(
pathway == "ire5_hq" ~ "Canonical 5' IRE",
pathway == "ire5_all" ~ "All predicted 5' IRE",
pathway == "ire3_hq" ~ "Canonical 3' IRE",
pathway == "ire3_all" ~ "All predicted 3' IRE"
)
) %>%
ggplot(aes(pathway, -log10(harmonic_p), fill = sig)) +
geom_col() +
facet_wrap(~coef) +
coord_flip() +
xlab("") +
scale_fill_manual(values = c("grey50", "turquoise3")) +
theme_bw() +
theme(strip.text = element_text(size = 5))
#ggsave("plots/IRE_HARMON_P.png", width = 10, height = 5, units = "cm", dpi = 300)
#png("plots/hq_3IRE_pheatmap_logFC.png", width = 5, height = 25, units = "cm", res = 300)
topTables_RUVk1 %>%
bind_rows() %>%
dplyr::filter(gene_id %in% zebrafishIreGenes$ire3_hq) %>%
dplyr::select(symbol, logFC, coef) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("symbol") %>%
pheatmap(
border_color = NA,
scale = "row",
height = 20,
main = "Canonical IRE 3'",
fontsize = 8,
width = 3,
cellwidth = 10,
treeheight_col = 0,
treeheight_row = 0,
show_rownames = T,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100)
)
#dev.off()
topTables_RUVk1 %>%
bind_rows() %>%
dplyr::filter(gene_id %in% zebrafishIreGenes$ire3_all) %>%
dplyr::select(symbol, logFC, coef) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("symbol") %>%
pheatmap(
border_color = NA,
scale = "row",
height = 5,
main = "All predicted IRE 3'",
fontsize = 8,
width = 3,
cellwidth = 10,
treeheight_col = 2,
treeheight_row = 0,
show_rownames = FALSE,
color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100)
)
I next want to further investigate whether changes to gene expression is observed consistent with a hypoxic response. Therefore, I will test the GROSS_HYPOXIA_VIA_HIF1A gene sets from MSigDB. Only the GROSS_HYPOXIA_VIA_HIF1A_DN gene set is altered in the trans samples.
hifTargets <- list(
GROSS_HYPOXIA_VIA_HIF1A_DN = gmtPathways("gene_sets/GROSS_HYPOXIA_VIA_HIF1A_DN.gmt") %>%
as.data.frame() %>%
set_colnames("hu_gene_name") %>%
left_join(zf2humangeneMapping) %>%
na.omit() %>%
.$gene_id,
GROSS_HYPOXIA_VIA_HIF1A_UP = gmtPathways("gene_sets/GROSS_HYPOXIA_VIA_HIF1A_UP.gmt") %>%
as.data.frame() %>%
set_colnames("hu_gene_name") %>%
left_join(zf2humangeneMapping) %>%
na.omit() %>%
.$gene_id
)
fry_Res_HIF <- design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
geneDGE %>%
cpm(log = T) %>%
fry(
index = hifTargets,
design = design_RUVk1,
contrast = x,
sort = "directional"
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x,
)
}, simplify = FALSE)
cameraRes_HIF <- design_RUVk1 %>% colnames() %>% .[2:4] %>%
sapply(function(x) {
geneDGE %>%
cpm(log = TRUE) %>%
camera(
index = hifTargets,
design = design_RUVk1,
contrast = x,
inter.gene.cor = NA,
sort = TRUE
) %>%
rownames_to_column("pathway") %>%
as_tibble() %>%
mutate(coef = x)
}, simplify = FALSE)
fgseaRes_HIF <- ranks_fgseaRUVk1 %>%
lapply(function(x){
fgseaMultilevel(stats = x,
pathways = hifTargets) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::select(pathway, pval, FDR, padj, everything()) %>%
arrange(pval) %>%
mutate(sig = padj < 0.05)
})
fgseaRes_HIF$`R122Pfs/+` %<>%
mutate(coef = "R122Pfs/+")
fgseaRes_HIF$trans %<>%
mutate(coef = "trans")
fgseaRes_HIF$`V1482Afs/+` %<>%
mutate(coef = "V1482Afs/+")
harmonic_p_HIF <- fry_Res_HIF%>%
bind_rows() %>%
dplyr::select(pathway, PValue.Mixed, coef) %>%
dplyr::rename(fry_p = PValue.Mixed) %>%
left_join(cameraRes_HIF %>%
bind_rows() %>%
dplyr::select(pathway, PValue, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(camera_p = PValue) %>%
left_join(fgseaRes_HIF %>%
bind_rows() %>%
dplyr::select(pathway, pval, coef),
by = c("pathway", "coef")) %>%
dplyr::rename(fgsea_p = pval) %>%
bind_rows() %>%
nest(p = one_of(c("fry_p", "camera_p", "fgsea_p"))) %>%
mutate(
harmonic_p = vapply(p, function(x){
x <- unlist(x)
x <- x[!is.na(x)]
p.hmp(x, L = 3)
}, numeric(1))
) %>%
unnest() %>%
mutate(harmonic_p_FDR = p.adjust(harmonic_p, "fdr"),
sig = harmonic_p_FDR < 0.05) %>%
arrange(harmonic_p_FDR)
# png("plots/heatmaps/GROSS_HYPOXIA_DN.png", width = 5, height = 7, units = "cm", res = 300)
topTables_RUVk1 %>%
bind_rows() %>%
mutate(coef = dplyr::recode(coef, `R122Pfs/+` = "null/+")) %>%
dplyr::filter(gene_id %in% hifTargets$GROSS_HYPOXIA_VIA_HIF1A_DN) %>%
dplyr::select(symbol, logFC, coef) %>%
spread(key = "coef", value = "logFC") %>%
column_to_rownames("symbol") %>%
pheatmap(
border_color = NA,
scale = "row",
height = 5,
main = "GROSS_HYPOXIA_VIA_HIF1A_DN",
fontsize = 8,
width = 3,
cellwidth = 10,
treeheight_col = 2,
treeheight_row = 0,
show_rownames = FALSE,
cluster_cols = FALSE,
color = colorRampPalette(rev(brewer.pal(n = 7, name =
"RdBu")))(100)
)
#dev.off()
sessionInfo() %>%
pander
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
locale: en_AU.UTF-8||en_AU.UTF-8||en_AU.UTF-8||C||en_AU.UTF-8||en_AU.UTF-8
attached base packages: grid, stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: ensembldb(v.2.12.1), AnnotationFilter(v.1.12.0), UpSetR(v.1.4.0), pander(v.0.6.3), VennDiagram(v.1.6.20), futile.logger(v.1.4.3), pathview(v.1.28.1), ggfortify(v.0.4.10), ggeasy(v.0.1.2), ggrepel(v.0.8.2), pheatmap(v.1.0.12), kableExtra(v.1.2.1), knitr(v.1.29), scales(v.1.1.1), gridExtra(v.2.3), RColorBrewer(v.1.1-2), ggpubr(v.0.4.0), harmonicmeanp(v.3.0), FMStable(v.0.1-2), metap(v.1.4), ngsReports(v.1.4.2), RUVSeq(v.1.22.0), EDASeq(v.2.22.0), ShortRead(v.1.46.0), GenomicAlignments(v.1.24.0), SummarizedExperiment(v.1.18.2), DelayedArray(v.0.14.1), matrixStats(v.0.56.0), Rsamtools(v.2.4.0), Biostrings(v.2.56.0), XVector(v.0.28.0), BiocParallel(v.1.22.0), fgsea(v.1.14.0), rtracklayer(v.1.48.0), GenomicFeatures(v.1.40.1), AnnotationDbi(v.1.50.3), Biobase(v.2.48.0), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), IRanges(v.2.22.2), S4Vectors(v.0.26.1), AnnotationHub(v.2.20.2), BiocFileCache(v.1.12.1), dbplyr(v.1.4.4), BiocGenerics(v.0.34.0), biomaRt(v.2.44.1), edgeR(v.3.30.3), limma(v.3.44.3), readxl(v.1.3.1), reshape(v.0.8.8), forcats(v.0.5.0), stringr(v.1.4.0), dplyr(v.1.0.2), purrr(v.0.3.4), readr(v.1.3.1), tidyr(v.1.1.2), tibble(v.3.0.3), ggplot2(v.3.3.2), tidyverse(v.1.3.0) and magrittr(v.1.5)
loaded via a namespace (and not attached): rappdirs(v.0.3.1), R.methodsS3(v.1.8.1), bit64(v.4.0.5), aroma.light(v.3.18.0), multcomp(v.1.4-13), R.utils(v.2.10.1), data.table(v.1.13.0), hwriter(v.1.3.2), KEGGREST(v.1.28.0), RCurl(v.1.98-1.2), generics(v.0.0.2), lambda.r(v.1.2.4), TH.data(v.1.0-10), RSQLite(v.2.2.0), bit(v.4.0.4), mutoss(v.0.1-12), webshot(v.0.5.2), xml2(v.1.3.2), lubridate(v.1.7.9), httpuv(v.1.5.4), assertthat(v.0.2.1), xfun(v.0.16), hms(v.0.5.3), evaluate(v.0.14), promises(v.1.1.1), fansi(v.0.4.1), progress(v.1.2.2), Rgraphviz(v.2.32.0), DBI(v.1.1.0), geneplotter(v.1.66.0), tmvnsim(v.1.0-2), htmlwidgets(v.1.5.1), ellipsis(v.0.3.1), backports(v.1.1.9), FactoMineR(v.2.3), annotate(v.1.66.0), gbRd(v.0.4-11), vctrs(v.0.3.4), here(v.0.1), abind(v.1.4-5), withr(v.2.2.0), prettyunits(v.1.1.1), mnormt(v.2.0.1), cluster(v.2.1.0), lazyeval(v.0.2.2), crayon(v.1.3.4), genefilter(v.1.70.0), labeling(v.0.3), pkgconfig(v.2.0.3), nlme(v.3.1-149), ProtGenerics(v.1.20.0), rlang(v.0.4.7), lifecycle(v.0.2.0), sandwich(v.2.5-1), mathjaxr(v.1.0-1), modelr(v.0.1.8), cellranger(v.1.1.0), rprojroot(v.1.3-2), graph(v.1.66.0), Matrix(v.1.2-18), carData(v.3.0-4), Rhdf5lib(v.1.10.1), zoo(v.1.8-8), reprex(v.0.3.0), png(v.0.1-7), viridisLite(v.0.3.0), bitops(v.1.0-6), R.oo(v.1.24.0), blob(v.1.2.1), jpeg(v.0.1-8.1), rstatix(v.0.6.0), ggsignif(v.0.6.0), leaps(v.3.1), memoise(v.1.1.0), plyr(v.1.8.6), bibtex(v.0.4.2.2), zlibbioc(v.1.34.0), compiler(v.4.0.2), plotrix(v.3.7-8), KEGGgraph(v.1.48.0), cli(v.2.0.2), formatR(v.1.7), mgcv(v.1.8-33), MASS(v.7.3-52), tidyselect(v.1.1.0), stringi(v.1.4.6), highr(v.0.8), yaml(v.2.2.1), askpass(v.1.1), locfit(v.1.5-9.4), latticeExtra(v.0.6-29), fastmatch(v.1.1-0), tools(v.4.0.2), rio(v.0.5.16), rstudioapi(v.0.11), foreign(v.0.8-80), farver(v.2.0.3), scatterplot3d(v.0.3-41), digest(v.0.6.25), BiocManager(v.1.30.10), shiny(v.1.5.0), Rcpp(v.1.0.5), car(v.3.0-9), broom(v.0.7.0), BiocVersion(v.3.11.1), later(v.1.1.0.1), org.Hs.eg.db(v.3.11.4), httr(v.1.4.2), ggdendro(v.0.1.21), Rdpack(v.1.0.0), colorspace(v.1.4-1), rvest(v.0.3.6), XML(v.3.99-0.5), fs(v.1.5.0), truncnorm(v.1.0-8), splines(v.4.0.2), statmod(v.1.4.34), sn(v.1.6-2), multtest(v.2.44.0), plotly(v.4.9.2.1), xtable(v.1.8-4), jsonlite(v.1.7.0), futile.options(v.1.0.1), flashClust(v.1.01-2), R6(v.2.4.1), TFisher(v.0.2.0), pillar(v.1.4.6), htmltools(v.0.5.0), mime(v.0.9), glue(v.1.4.2), fastmap(v.1.0.1), DT(v.0.15), DESeq(v.1.39.0), interactiveDisplayBase(v.1.26.3), codetools(v.0.2-16), mvtnorm(v.1.1-1), lattice(v.0.20-41), numDeriv(v.2016.8-1.1), curl(v.4.3), zip(v.2.1.1), openxlsx(v.4.1.5), openssl(v.1.4.2), survival(v.3.2-3), rmarkdown(v.2.3), munsell(v.0.5.0), rhdf5(v.2.32.2), GenomeInfoDbData(v.1.2.3), haven(v.2.3.1), reshape2(v.1.4.4) and gtable(v.0.3.0)